In [1]:
#import usual libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm

#install library to convert country names to ISO3 country codes
!pip install country_converter --upgrade

#import library for heat map
!pip install geopandas
import geopandas as gpd
In [2]:
#union density dataset
union = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/ILO_union_density/main/data-SmplN.csv')
#world happiness report 2019 (chosen because the union dataset has the most entries for 2019)
happiness = pd.read_csv('https://raw.githubusercontent.com/JulieChandlerDS/world-happiness-report/main/2019.csv')
In [3]:
#only use 2019 data since that's the year with the most union observations
union = union.loc[union['time']==2019.0]

Data Cleaning

In [4]:
#remove unnecessary columns from the union dataframe; we just need the observation value and the country name
union = union.drop(['time', 'area_time', 'Column 1','Column 1.1','Column 1.2'], axis = 1)
In [5]:
#do the same for happiness
happiness = happiness.drop(['Overall rank', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'], axis = 1)

We'll center the data. This is as simple as subtracting the mean from each observation's score:

$x-\bar{x}$
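
For example, scores of 3, 5, and 7 have a mean of 5, so the centered values are -2, 0, and 2.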

In [6]:
#store the original mean of union density (for reference)
union_mean = union['obs_value'].mean()
#store the original mean of the happiness score (for reference)
happiness_mean = happiness['Score'].mean()
In [7]:
#centering data
union['obs_value'] = union['obs_value'] - union['obs_value'].mean()
happiness['Score'] = happiness['Score'] - happiness['Score'].mean()

Next we'll scale the data by dividing each centered value by its standard deviation (z-score standardization):

$x' = \frac{x-\bar{x}}{\sigma_x}$
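
Continuing the small example above, the centered values -2, 0, and 2 have a (sample) standard deviation of 2, so the standardized values are -1, 0, and 1.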

In [8]:
#Scale happiness (already centered above, so this just divides by the standard deviation)
happiness['Score'] = (happiness['Score']-happiness['Score'].mean())/happiness['Score'].std()
#Scale union density likewise
union['obs_value'] = (union['obs_value']-union['obs_value'].mean())/union['obs_value'].std()
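
As a quick sanity check (a minimal sketch, not one of the original cells), the transformed columns should now have a mean of roughly 0 and a standard deviation of roughly 1:

#verify the centering and scaling: means ~0, standard deviations ~1
print(happiness['Score'].mean(), happiness['Score'].std())
print(union['obs_value'].mean(), union['obs_value'].std())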
In [9]:
happiness
Out[9]:
Country or region Score
0 Finland 2.121877
1 Denmark 1.970052
2 Norway 1.928727
3 Iceland 1.874824
4 Netherlands 1.869434
... ... ...
151 Rwanda -1.862420
152 Tanzania -1.954952
153 Afghanistan -1.980107
154 Central African Republic -2.087912
155 South Sudan -2.294538

156 rows × 2 columns

In [10]:
import country_converter as coco
happiness['Country or region'] = coco.convert(names=happiness['Country or region'], to='ISO3')
happiness
Out[10]:
Country or region Score
0 FIN 2.121877
1 DNK 1.970052
2 NOR 1.928727
3 ISL 1.874824
4 NLD 1.869434
... ... ...
151 RWA -1.862420
152 TZA -1.954952
153 AFG -1.980107
154 CAF -2.087912
155 SSD -2.294538

156 rows × 2 columns

In [11]:
happiness.rename(columns = {'Country or region':'ref_area'}, inplace = True)
In [12]:
#combine datasets
df = happiness.merge(union, how='left')
df
Out[12]:
ref_area Score obs_value
0 FIN 2.121877 2.089948
1 DNK 1.970052 2.544855
2 NOR 1.928727 1.623946
3 ISL 1.874824 3.898480
4 NLD 1.869434 -0.317729
... ... ... ...
151 RWA -1.862420 -0.844755
152 TZA -1.954952 NaN
153 AFG -1.980107 -0.240062
154 CAF -2.087912 NaN
155 SSD -2.294538 NaN

156 rows × 3 columns


In [14]:
df.rename(columns={'Score':'Happiness'}, inplace=True)
df.rename(columns={'obs_value':'Union Density'}, inplace=True)
df.rename(columns={'ref_area':'Country'}, inplace=True)
df = df.dropna()
df
Out[14]:
Country Happiness Union Density
0 FIN 2.121877 2.089948
1 DNK 1.970052 2.544855
2 NOR 1.928727 1.623946
3 ISL 1.874824 3.898480
4 NLD 1.869434 -0.317729
6 SWE 1.739169 2.444997
9 AUT 1.652027 0.281417
11 CRI 1.581055 -0.034799
13 LUX 1.511880 0.392370
14 GBR 1.479539 0.126083
16 DEU 1.417551 -0.267800
17 BEL 1.361851 1.551827
26 GTM 0.924342 -1.000089
29 ESP 0.850676 -0.484158
30 PAN 0.821029 0.286964
31 BRA 0.802163 -0.450872
33 SGP 0.768025 0.059511
35 ITA 0.732988 0.630918
36 BHR 0.711427 -1.027827
37 SVK 0.710529 -0.556277
41 LTU 0.666508 -0.761540
42 COL 0.644947 -0.911326
51 THA 0.539837 -0.988993
52 LVA 0.478748 -0.584015
54 EST 0.436524 -0.839207
57 JPN 0.430236 -0.240062
60 BOL 0.334109 -0.506348
61 HUN 0.315244 -0.689421
62 PRY 0.301768 -0.789278
70 MDA 0.109515 -0.140204
76 DOM 0.016084 -0.755992
78 TUR -0.030631 -0.622849
82 MNG -0.109688 0.536608
83 MKD -0.119570 -0.245609
88 MAR -0.178863 -0.567373
91 IDN -0.193237 -0.450872
97 GHA -0.369319 -0.240062
101 BEN -0.470835 0.592085
105 ZAF -0.615474 0.442298
109 PSE -0.638832 0.009582
110 SEN -0.652307 -0.761540
111 SOM -0.663986 -0.517444
115 ARM -0.761909 -0.034799
118 GEO -0.797844 -0.179038
120 KEN -0.806828 -0.633944
123 TUN -0.849950 0.941586
129 LKA -0.935296 -0.611754
132 UKR -0.965840 0.858372
133 ETH -1.007166 -0.650587
137 ZMB -1.167975 0.153821
138 TGO -1.187739 0.148273
143 LSO -1.441980 -0.905779
151 RWA -1.862420 -0.844755
153 AFG -1.980107 -0.240062

Visualization

Scatter Plot

In [15]:
#scatter plot
plt.style.use('fivethirtyeight')
plt.scatter(df['Happiness'], df['Union Density'])
plt.title("correlation between happiness and union density")
plt.xlabel("happiness")
plt.ylabel("union density")
plt.show();

Histograms

In [16]:
plt.hist(df['Happiness'])
plt.show()
In [17]:
plt.hist(df['Union Density'])
plt.show()

Heat Map

In [18]:
import seaborn as sns
# Extract relevant columns from the dataframe
corr_df = df[['Union Density', 'Happiness']]

# Compute the correlation matrix
corr_matrix = corr_df.corr()

# Plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)

# Add title and axis labels
plt.title('Correlation Heatmap of Union Density and Happiness')
plt.xlabel('Variable')
plt.ylabel('Variable')

# Show the plot
plt.show()

Machine learning

Pooled Linear regression

A linear regression seems most appropriate for testing whether one variable predicts another, since that is its usual use.

$Y_i=f(X_i, \beta)+e_i$
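
In our case this reduces to a straight-line model, $Y_i=\beta_0+\beta_1 X_i+e_i$, with happiness as $X$ and union density as $Y$.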

In [19]:
#define the names of the dependent and independent variables for the regressions below
y_var_name = 'Union Density'
X_var_names = ['Happiness']
In [20]:
#pooled linear regression: one fitted line across all countries
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.regplot(data=df, x='Happiness', y='Union Density')
plt.title("Correlation between happiness and union density")
plt.xlabel("Happiness")
plt.ylabel("Union density");

K-means clustering

In [23]:
from sklearn.cluster import KMeans

# Extract relevant columns from the dataframe
X = df[['Union Density', 'Happiness']]

# Define number of clusters
k = 4

# Fit KMeans algorithm to the data (n_init is set explicitly to avoid a FutureWarning)
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

# Add a new column to the dataframe with the cluster labels (.loc avoids a chained-assignment warning)
df.loc[:, 'Cluster'] = kmeans.labels_

# Plot the clusters
plt.scatter(df['Union Density'], df['Happiness'], c=df['Cluster'])
plt.xlabel('Union Density')
plt.ylabel('Happiness')
plt.title('K-means clustering of union density and happiness')
plt.show()

Correlation or causation

OLS, or ordinary least squares, estimates the coefficients of our linear regression by minimizing the sum of squared residuals between the fitted line and the data.

$\hat{{\beta}}=\left(\mathbf{X}^{\top} \mathbf{X}\right)^{-1} \mathbf{X}^{\top} \mathbf{y}$
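
As a quick check (a minimal sketch, assuming the cleaned df from above), the normal-equations formula can be evaluated directly with NumPy; its result should match the Happiness coefficient that statsmodels reports below:

#solve the normal equations by hand for the no-intercept model
X_mat = df['Happiness'].to_numpy().reshape(-1, 1)   #design matrix without a constant column
y_vec = df['Union Density'].to_numpy()
beta_hat = np.linalg.inv(X_mat.T @ X_mat) @ X_mat.T @ y_vec
print(beta_hat)   #should agree with the sm.OLS coefficient below (about 0.40)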

In [24]:
#OLS for union density vs happiness
np.random.seed(9876789)

X = df['Happiness']
y = df['Union Density']

#no constant is added to X, so the model is fit without an intercept (hence the uncentered R-squared below)
model = sm.OLS(y, X, missing='drop')
results = model.fit()
results.summary()
Out[24]:
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:          Union Density   R-squared (uncentered):                   0.190
Model:                            OLS   Adj. R-squared (uncentered):              0.175
Method:                 Least Squares   F-statistic:                              12.42
Date:                Thu, 06 Apr 2023   Prob (F-statistic):                    0.000886
Time:                        00:21:10   Log-Likelihood:                         -70.649
No. Observations:                  54   AIC:                                      143.3
Df Residuals:                      53   BIC:                                      145.3
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
=======================================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Happiness      0.4025      0.114      3.524      0.001       0.173       0.632
=======================================================================================
Omnibus:                       15.667   Durbin-Watson:                   1.581
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               18.327
Skew:                           1.166   Prob(JB):                     0.000105
Kurtosis:                       4.646   Cond. No.                         1.00
=======================================================================================


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Pooled OLS

In [25]:
# Define X and y variables using the column names set earlier
y_var = df[y_var_name]
X_var = df[X_var_names]
In [26]:
# Carve out y variable
pooled_y = y_var
In [27]:
# Carve out X variable
pooled_X = X_var
In [28]:
# Add the placeholder for the regression intercept. When the model is fitted, the coefficient of this variable is the regression model’s intercept β_0.
pooled_X = sm.add_constant(pooled_X)
In [29]:
#Build the OLS regression model:

pooled_olsr_model = sm.OLS(endog=pooled_y, exog=pooled_X)
In [30]:
#Train the model on the (y, X) data set and fetch the training results:

pooled_olsr_model_results = pooled_olsr_model.fit()
In [31]:
#Print the training summary:
print(pooled_olsr_model_results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:          Union Density   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.182
Method:                 Least Squares   F-statistic:                     12.78
Date:                Thu, 06 Apr 2023   Prob (F-statistic):           0.000765
Time:                        00:21:10   Log-Likelihood:                -70.391
No. Observations:                  54   AIC:                             144.8
Df Residuals:                      52   BIC:                             148.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0897      0.127     -0.707      0.483      -0.344       0.165
Happiness      0.4216      0.118      3.575      0.001       0.185       0.658
==============================================================================
Omnibus:                       14.562   Durbin-Watson:                   1.596
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               16.366
Skew:                           1.119   Prob(JB):                     0.000279
Kurtosis:                       4.506   Cond. No.                         1.28
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Pearson Correlation

The Pearson correlation coefficient measures the linear relationship between two datasets. Like other correlation coefficients, it varies between -1 and +1, with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship.

Note that the p-value calculation relies on the assumption that each dataset is normally distributed.

$r = \frac{\sum(x-m_x)(y-m_y)}{\sqrt{\sum(x-m_x)^2\sum(y-m_y)^2}}$
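
As a quick check (a minimal sketch using the cleaned data from above), the formula can be evaluated directly with NumPy; the result should agree with scipy's pearsonr below:

#compute Pearson's r by hand from the formula above
x_arr = df['Happiness'].to_numpy()
y_arr = df['Union Density'].to_numpy()
num = ((x_arr - x_arr.mean()) * (y_arr - y_arr.mean())).sum()
den = np.sqrt(((x_arr - x_arr.mean())**2).sum() * ((y_arr - y_arr.mean())**2).sum())
print(num / den)   #should agree with sp.stats.pearsonr below (about 0.444)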

In [32]:
#check for NaN or infinite values (the notebook displays only the final expression's result)
np.isnan(X).any()
np.isnan(y).any()

np.isinf(X).any()
np.isinf(y).any()
Out[32]:
False
In [33]:
#replace any NaN or infinite values with finite numbers (there should be none left after the earlier dropna)
X = np.nan_to_num(X)
y = np.nan_to_num(y)
In [34]:
sp.stats.pearsonr(X, y)
Out[34]:
PearsonRResult(statistic=0.44422304265859053, pvalue=0.0007654853445457307)